## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 554925 29.7 1242271 66.4 686457 36.7
## Vcells 1016538 7.8 8388608 64.0 1876640 14.4
library(magrittr)
library(data.table)
library(knitr)
library(ggplot2)
library(ComplexUpset)
`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ath.gmm = gmm[gmm$plant == 'ath', ]
setDT(ath.gmm)
ath.gmm[, plant := NULL]
ath.gmm[, source := NULL]
colnames(ath.gmm)[2:4] = paste('ath', colnames(ath.gmm)[2:4], sep = '_')note: some duplicated ids in PSS
fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12',
'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]
fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
# Remove NA and empty strings
x = x[!is.na(x) & x != ""]
paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]
fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]
pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]
pss_long = melt(
pss,
id.vars = c("name", "all_pathways", 'short_name'), # Columns to keep as is
measure.vars = patterns("^id_"), # Columns to melt (all starting with "id_")
variable.name = "id_num", # Name for the melted variable column
value.name = "id" # Name for the melted value column
)
pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))##
## FALSE TRUE
## 816 24
pss_long %>%
dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
dplyr::arrange(id) %>%
print()## all_pathways short_name id
## <char> <char> <char>
## 1: Hormone - Ethylene (ET) ORA59 AT1G06160
## 2: Hormone - Ethylene (ET) ERF/ORA59 AT1G06160
## 3: Hormone - Salicylic acid (SA) TCP8 AT1G58100
## 4: Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
## 5: Hormone - Ethylene (ET) EDF2 AT1G68840
## 6: Hormone - Ethylene (ET) ERF/EDF AT1G68840
## 7: Signalling - Heat-shock proteins (HSPs),Stress - Heat HSP70 AT3G12580
## 8: Signalling - Heat-shock proteins (HSPs) HSP AT3G12580
## 9: Hormone - Ethylene (ET) ERF1 AT3G23240
## 10: Hormone - Ethylene (ET) ERF/EDF AT3G23240
## 11: Hormone - Ethylene (ET) ERF6 AT4G17490
## 12: Hormone - Ethylene (ET) ERF/ORA59 AT4G17490
## 13: Hormone - Ethylene (ET) ERF1 AT4G17500
## 14: Hormone - Ethylene (ET) ERF/EDF AT4G17500
## 15: Signalling - Heat-shock proteins (HSPs) MED37E AT5G02500
## 16: Signalling - Heat-shock proteins (HSPs) HSP AT5G02500
## 17: Hormone - Ethylene (ET) ERF096 AT5G43410
## 18: Hormone - Ethylene (ET) ERF/EDF AT5G43410
## 19: Hormone - Ethylene (ET) ERF5 AT5G47230
## 20: Hormone - Ethylene (ET) ERF/ORA59 AT5G47230
## 21: Hormone - Ethylene (ET) ERF105 AT5G51190
## 22: Hormone - Ethylene (ET) ERF/ORA59 AT5G51190
## 23: Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24: Signalling - Heat-shock proteins (HSPs) HSP AT5G59720
## 25: Hormone - Ethylene (ET) ERF104 AT5G61600
## 26: Hormone - Ethylene (ET) ERF/EDF AT5G61600
## all_pathways short_name id
pss_long = pss_long %>%
dplyr::filter(stringr::str_starts(id, "AT")) %>%
dplyr::group_by(id) %>%
dplyr::summarise(
dplyr::across(
.cols = dplyr::everything(),
.fns = ~ {
vals = unique(na.omit(.))
if (length(vals) > 1) paste(vals, collapse = " | ")
else if (length(vals) == 1) vals
else NA_character_
}
),
.groups = "drop"
)
pss_long[pss_long == ""] = "-"
gn[is.na(gn)] = "-"
sn[is.na(sn)] = "-"
ath.annot = pss_long %>%
dplyr::full_join(gn, by = c("id" = "V1")) %>%
dplyr::full_join(sn, by = c("id" = "V1"))Note: be careful with 35.2 bin matches
params_list <- list(
plantName = 'stu'
, plantNameOut = "potato"
, plantDirOut = file.path('..', 'reports', group, "potato")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x0000021ba2e1b540>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-11] | |…… | 11% | |…….. | 15% [unnamed-chunk-12] | |……… | 19% | |……….. | 22% [unnamed-chunk-13] | |…………. | 26% | |…………… | 30% [unnamed-chunk-14] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-15] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-16] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-17] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-18] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-19] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-20] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-21] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-22] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-23] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 15035 75961 17547 51397 23243 33885
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT2G37600 ath PGSC0003DMG400000168 stu FastOMA
## 2 AT3G53740 ath PGSC0003DMG400000168 stu FastOMA
## 3 AT3G23880 ath Sotub12g029980 stu FastOMA
## 4 AT2G13770 ath Sotub12g032820 stu FastOMA
## 5 AT1G01220 ath Soltu.DM.01G035280 stu MCScanX
## 6 AT1G01225 ath Soltu.DM.01G035270 stu MCScanX
## 7 ATMG01080 ath PGSC0003DMG400002202 stu MCScanX
## 8 ATMG01110 ath Soltu.DM.12G017350 stu MCScanX
## 9 AT1G56560 ath Soltu.DM.01G018690 stu OrthoDB
## 10 AT3G06500 ath Soltu.DM.01G018690 stu OrthoDB
## 11 AT5G47320 ath Sotub01g015810 stu OrthoDB
## 12 AT2G28815 ath Soltu.DM.01G010300 stu OrthoDB
## 13 AT1G61070 ath Soltu.DM.01G000020 stu PLAZA
## 14 AT2G02100 ath Soltu.DM.01G000020 stu PLAZA
## 15 AT4G28140 ath PGSC0003DMG400041562 stu PLAZA
## 16 AT3G15353 ath PGSC0003DMG400015318 stu PLAZA
## 17 AT1G01030 ath Soltu.DM.08G005020 stu RBH
## 18 AT1G01030 ath Soltu.DM.08G005030 stu RBH
## 19 ATMG01360 ath Sotub03g026800 stu RBH
## 20 ATMG01360 ath Sotub07g016160 stu RBH
## 21 AT5G01020 ath Soltu.DM.10G018260 stu compara
## 22 AT5G01030 ath Soltu.DM.09G001170 stu compara
## 23 AT1G80950 ath Soltu.DM.04G034970 stu compara
## 24 AT1G80980 ath Soltu.DM.03G031250 stu compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1962885 104.9 3294447 176.0 3126594 167.0
## Vcells 19373211 147.9 31696991 241.9 31627982 241.4
## [1] 23082
## [1] 30489
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 15035 75961 17547 51397 23243 33885
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "all_pathways" "short_name"
## [17] "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE TRUE
## 115350 428
## [1] "PGSC0003DMG400002971" "PGSC0003DMG401011339" "PGSC0003DMG401011339"
## [4] "Soltu.DM.S000060" "Soltu.DM.S000060" "Soltu.DM.S000060"
## [7] "Soltu.DM.S000060" "Soltu.DM.S000060" "Soltu.DM.S000060"
## [10] "Soltu.DM.S000060" "Soltu.DM.S000070" "Soltu.DM.S000070"
## [13] "Soltu.DM.S000100" "Soltu.DM.S000110" "Soltu.DM.S000120"
## [16] "Soltu.DM.S000120" "Soltu.DM.S000140" "Soltu.DM.S000140"
## [19] "Soltu.DM.S000160" "Soltu.DM.S000160" "Soltu.DM.S000170"
## [22] "Soltu.DM.S000170" "Soltu.DM.S000180" "Soltu.DM.S000210"
## [25] "Soltu.DM.S000210" "Soltu.DM.S000240" "Soltu.DM.S000240"
## [28] "Soltu.DM.S000240" "Soltu.DM.S000270" "Soltu.DM.S000280"
## [31] "Soltu.DM.S000290" "Soltu.DM.S000290" "Soltu.DM.S000320"
## [34] "Soltu.DM.S000330" "Soltu.DM.S000360" "Soltu.DM.S000360"
## [37] "Soltu.DM.S000420" "Soltu.DM.S000420" "Soltu.DM.S000430"
## [40] "Soltu.DM.S000430" "Soltu.DM.S000510" "Soltu.DM.S000510"
## [43] "Soltu.DM.S000510" "Soltu.DM.S000510" "Soltu.DM.S000510"
## [46] "Soltu.DM.S000510" "Soltu.DM.S000510" "Soltu.DM.S000510"
## [49] "Soltu.DM.S000510" "Soltu.DM.S000510" "Soltu.DM.S000510"
## [52] "Soltu.DM.S000510" "Soltu.DM.S000510" "Soltu.DM.S000520"
## [55] "Soltu.DM.S000520" "Soltu.DM.S000780" "Soltu.DM.S000780"
## [58] "Soltu.DM.S000830" "Soltu.DM.S000840" "Soltu.DM.S000850"
## [61] "Soltu.DM.S000850" "Soltu.DM.S000850" "Soltu.DM.S000850"
## [64] "Soltu.DM.S000850" "Soltu.DM.S000850" "Soltu.DM.S000850"
## [67] "Soltu.DM.S000850" "Soltu.DM.S000850" "Soltu.DM.S000850"
## [70] "Soltu.DM.S000850" "Soltu.DM.S000850" "Soltu.DM.S000870"
## [73] "Soltu.DM.S000890" "Soltu.DM.S000920" "Soltu.DM.S000950"
## [76] "Soltu.DM.S000980" "Soltu.DM.S000990" "Soltu.DM.S000990"
## [79] "Soltu.DM.S001000" "Soltu.DM.S001000" "Soltu.DM.S001000"
## [82] "Soltu.DM.S001000" "Soltu.DM.S001100" "Soltu.DM.S001120"
## [85] "Soltu.DM.S001120" "Soltu.DM.S001120" "Soltu.DM.S001120"
## [88] "Soltu.DM.S001130" "Soltu.DM.S001130" "Soltu.DM.S001130"
## [91] "Soltu.DM.S001130" "Soltu.DM.S001130" "Soltu.DM.S001130"
## [94] "Soltu.DM.S001130" "Soltu.DM.S001130" "Soltu.DM.S001130"
## [97] "Soltu.DM.S001130" "Soltu.DM.S001130" "Soltu.DM.S001130"
## [100] "Soltu.DM.S001130" "Soltu.DM.S001130" "Soltu.DM.S001130"
## [103] "Soltu.DM.S001270" "Soltu.DM.S001270" "Soltu.DM.S001270"
## [106] "Soltu.DM.S001280" "Soltu.DM.S001300" "Soltu.DM.S001340"
## [109] "Soltu.DM.S001340" "Soltu.DM.S001350" "Soltu.DM.S001470"
## [112] "Soltu.DM.S001470" "Soltu.DM.S001520" "Soltu.DM.S001520"
## [115] "Soltu.DM.S001520" "Soltu.DM.S001520" "Soltu.DM.S001540"
## [118] "Soltu.DM.S001540" "Soltu.DM.S001600" "Soltu.DM.S001650"
## [121] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [124] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [127] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [130] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [133] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [136] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [139] "Soltu.DM.S001650" "Soltu.DM.S001650" "Soltu.DM.S001650"
## [142] "Soltu.DM.S001650" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [145] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [148] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [151] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [154] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [157] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [160] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [163] "Soltu.DM.S001660" "Soltu.DM.S001660" "Soltu.DM.S001660"
## [166] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [169] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [172] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [175] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [178] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [181] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [184] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001670"
## [187] "Soltu.DM.S001670" "Soltu.DM.S001670" "Soltu.DM.S001680"
## [190] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [193] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [196] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [199] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [202] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [205] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [208] "Soltu.DM.S001680" "Soltu.DM.S001680" "Soltu.DM.S001680"
## [211] "Soltu.DM.S001680" "Soltu.DM.S001700" "Soltu.DM.S001700"
## [214] "Soltu.DM.S001700" "Soltu.DM.S001700" "Soltu.DM.S001700"
## [217] "Soltu.DM.S001700" "Soltu.DM.S001700" "Soltu.DM.S001700"
## [220] "Soltu.DM.S001740" "Soltu.DM.S001740" "Soltu.DM.S001740"
## [223] "Soltu.DM.S001740" "Soltu.DM.S001740" "Soltu.DM.S001740"
## [226] "Soltu.DM.S001740" "Soltu.DM.S001740" "Soltu.DM.S001740"
## [229] "Soltu.DM.S001740" "Soltu.DM.S001740" "Soltu.DM.S001740"
## [232] "Soltu.DM.S001740" "Soltu.DM.S001740" "Soltu.DM.S001750"
## [235] "Soltu.DM.S001770" "Soltu.DM.S001810" "Soltu.DM.S001840"
## [238] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [241] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [244] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [247] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [250] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [253] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [256] "Soltu.DM.S001840" "Soltu.DM.S001840" "Soltu.DM.S001840"
## [259] "Soltu.DM.S001840" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [262] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [265] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [268] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [271] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [274] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [277] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [280] "Soltu.DM.S001850" "Soltu.DM.S001850" "Soltu.DM.S001850"
## [283] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [286] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [289] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [292] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [295] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [298] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [301] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001860"
## [304] "Soltu.DM.S001860" "Soltu.DM.S001860" "Soltu.DM.S001870"
## [307] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [310] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [313] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [316] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [319] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [322] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [325] "Soltu.DM.S001870" "Soltu.DM.S001870" "Soltu.DM.S001870"
## [328] "Soltu.DM.S001870" "Soltu.DM.S001970" "Soltu.DM.S001970"
## [331] "Soltu.DM.S001970" "Soltu.DM.S001970" "Soltu.DM.S002030"
## [334] "Soltu.DM.S002030" "Soltu.DM.S002050" "Soltu.DM.S002050"
## [337] "Soltu.DM.S002090" "Soltu.DM.S002100" "Soltu.DM.S002100"
## [340] "Soltu.DM.S002100" "Soltu.DM.S002140" "Soltu.DM.S002160"
## [343] "Soltu.DM.S002160" "Soltu.DM.S002160" "Soltu.DM.S002160"
## [346] "Soltu.DM.S002180" "Soltu.DM.S002180" "Soltu.DM.S002200"
## [349] "Soltu.DM.S002200" "Soltu.DM.S002200" "Soltu.DM.S002200"
## [352] "Soltu.DM.S002210" "Soltu.DM.S002210" "Soltu.DM.S002210"
## [355] "Soltu.DM.S002210" "Soltu.DM.S002220" "Soltu.DM.S002220"
## [358] "Soltu.DM.S002220" "Soltu.DM.S002220" "Soltu.DM.S002230"
## [361] "Soltu.DM.S002230" "Soltu.DM.S002230" "Soltu.DM.S002230"
## [364] "Soltu.DM.S002230" "Soltu.DM.S002230" "Soltu.DM.S002230"
## [367] "Soltu.DM.S002230" "Soltu.DM.S002230" "Soltu.DM.S002230"
## [370] "Soltu.DM.S002230" "Soltu.DM.S002230" "Soltu.DM.S002230"
## [373] "Soltu.DM.S002230" "Soltu.DM.S002260" "Soltu.DM.S002260"
## [376] "Soltu.DM.S002260" "Soltu.DM.S002260" "Soltu.DM.S002260"
## [379] "Soltu.DM.S002260" "Soltu.DM.S002330" "Soltu.DM.S002330"
## [382] "Soltu.DM.S002340" "Soltu.DM.S002350" "Soltu.DM.S002350"
## [385] "Soltu.DM.S002350" "Soltu.DM.S002350" "Soltu.DM.S002350"
## [388] "Soltu.DM.S002350" "Soltu.DM.S002350" "Soltu.DM.S002350"
## [391] "Soltu.DM.S002350" "Soltu.DM.S002420" "Soltu.DM.S002420"
## [394] "Soltu.DM.S002440" "Soltu.DM.S002460" "Soltu.DM.S002490"
## [397] "Soltu.DM.S002490" "Soltu.DM.S002490" "Soltu.DM.S002490"
## [400] "Soltu.DM.S002490" "Soltu.DM.S002490" "Soltu.DM.S002490"
## [403] "Soltu.DM.S002510" "Soltu.DM.S002640" "Soltu.DM.S002640"
## [406] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [409] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [412] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [415] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [418] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [421] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [424] "Soltu.DM.S002650" "Soltu.DM.S002650" "Soltu.DM.S002650"
## [427] "Soltu.DM.S002650" "Soltu.DM.S002650"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 7069392 377.6 13748575 734.3 13748575 734.3
## Vcells 119359765 910.7 188153622 1435.5 188084823 1435.0
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 28662
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 23082
## # stu unigue genes: 30489
## # ath highest connection degree: 88
## # stu highest connection degree: 59
## # genes in ath with degree > 30: 473
## # genes in stu with degree > 30: 310
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 59808 28662 17368 9940
##
## 100% match bad MapMan no match partial match
## 1 30613 17436 15279 7945
## 2 10389 4009 1554 1200
## 3 7039 2383 355 402
## 4 5503 1954 123 230
## 5 4388 1911 51 120
## 6 1876 969 6 43
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 115778
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 24097 8206 2176 1652
##
## 100% match bad MapMan no match partial match
## 1 1706 1279 1179 271
## 2 6389 1177 573 737
## 3 5119 1239 255 271
## 4 4790 1723 113 214
## 5 4217 1819 50 116
## 6 1876 969 6 43
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 18684
## # stu unigue genes: 22686
## # ath highest connection degree: 52
## # stu highest connection degree: 45
## # genes in ath with degree > 30: 2
## # genes in stu with degree > 30: 2
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 17547
## 2: compara 6740
## 3: PLAZA 6084
## 4: OrthoDB_FastOMA_RBH_MapMan4 1491
## 5: FastOMA_OrthoDB_MapMan4 2503
## 6: OrthoDB_RBH_MapMan4 828
## 7: FastOMA_RBH_MapMan4 938
## 8: reject 79647
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_OrthoDB_MapMan4
## 204 98
## FastOMA_RBH_MapMan4 MCScanX
## 58 859
## OrthoDB_FastOMA_RBH_MapMan4 OrthoDB_RBH_MapMan4
## 84 35
## PLAZA reject
## 311 2944
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'sly'
, plantNameOut = "tomato"
, plantDirOut = file.path('..', 'reports', group, "tomato")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x0000021c10991738>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-39] | |…… | 11% | |…….. | 15% [unnamed-chunk-40] | |……… | 19% | |……….. | 22% [unnamed-chunk-41] | |…………. | 26% | |…………… | 30% [unnamed-chunk-42] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-43] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-44] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-45] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-46] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-47] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-48] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-49] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-50] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-51] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 14733 54079 17647 42001 22210 26629
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G06160 ath PRAM_106208 sly FastOMA
## 2 AT2G31230 ath PRAM_106208 sly FastOMA
## 3 AT3G63390 ath Solyc12T002847 sly FastOMA
## 4 AT1G55320 ath Solyc12T002849 sly FastOMA
## 5 AT1G01180 ath Solyc01T003002 sly MCScanX
## 6 AT1G01190 ath Solyc01T002993 sly MCScanX
## 7 ATMG01110 ath Solyc11T001808 sly MCScanX
## 8 ATMG01330 ath Solyc11T001788 sly MCScanX
## 9 AT2G19940 ath Solyc01T004061 sly OrthoDB
## 10 AT3G04650 ath Solyc01T003078 sly OrthoDB
## 11 AT2G07633 ath Solyc11T001073 sly OrthoDB
## 12 AT2G07633 ath Solyc11T001072 sly OrthoDB
## 13 AT5G07610 ath Solyc05T002277 sly PLAZA
## 14 AT4G04330 ath Solyc02T001764 sly PLAZA
## 15 AT3G18160 ath Solyc12T001775 sly PLAZA
## 16 AT3G01510 ath Solyc12T001230 sly PLAZA
## 17 AT1G01030 ath Solyc04T000117 sly RBH
## 18 AT1G01030 ath Solyc08T000375 sly RBH
## 19 ATMG01360 ath Solyc11T001913 sly RBH
## 20 ATMG01410 ath Solyc11T001822 sly RBH
## 21 AT3G20015 ath Solyc04T000286 sly compara
## 22 AT5G66990 ath Solyc02T000133 sly compara
## 23 AT1G69770 ath Solyc12T002848 sly compara
## 24 AT1G55350 ath Solyc12T002850 sly compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4329052 231.2 10998861 587.5 13748576 734.3
## Vcells 95202906 726.4 188153622 1435.5 188136477 1435.4
## [1] 22707
## [1] 23599
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 14733 54079 17647 42001 22210 26629
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "all_pathways" "short_name"
## [17] "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE TRUE
## 83343 980
## [1] "PRAM_106208" "PRAM_106208" "PRAM_106208"
## [4] "PRAM_106208.1" "PRAM_106208.1" "PRAM_106208.1"
## [7] "PRAM_106208.1" "PRAM_106208.1" "PRAM_106382.1"
## [10] "PRAM_116774" "PRAM_116774" "PRAM_121333"
## [13] "PRAM_121333" "PRAM_121333" "PRAM_121333"
## [16] "PRAM_12783" "PRAM_12783" "PRAM_12783"
## [19] "PRAM_12783" "PRAM_128172" "PRAM_128172"
## [22] "PRAM_128172" "PRAM_128172" "PRAM_128172"
## [25] "PRAM_139928.1" "PRAM_139928.1" "PRAM_139928.1.p1"
## [28] "PRAM_139928.1.p1" "PRAM_140960.1.p1" "PRAM_141672.1.p1"
## [31] "PRAM_141691" "PRAM_141691" "PRAM_141691"
## [34] "PRAM_141691" "PRAM_141691" "PRAM_141691"
## [37] "PRAM_141691.1" "PRAM_141691.1" "PRAM_141691.1"
## [40] "PRAM_141691.1" "PRAM_141691.1" "PRAM_141691.1.p1"
## [43] "PRAM_141691.1.p1" "PRAM_141691.1.p1" "PRAM_141691.1.p1"
## [46] "PRAM_141691.1.p1" "PRAM_141691.1.p1" "PRAM_141691.1.p1"
## [49] "PRAM_141869" "PRAM_141869" "PRAM_141869.1"
## [52] "PRAM_141869.1" "PRAM_141870" "PRAM_141870"
## [55] "PRAM_141870" "PRAM_141870" "PRAM_141870"
## [58] "PRAM_141870" "PRAM_141870" "PRAM_141870"
## [61] "PRAM_141870" "PRAM_141870" "PRAM_141870"
## [64] "PRAM_141870.1" "PRAM_141870.1" "PRAM_141870.1"
## [67] "PRAM_141870.1" "PRAM_141870.1" "PRAM_141870.1.p1"
## [70] "PRAM_141870.1.p1" "PRAM_141870.1.p1" "PRAM_141870.1.p1"
## [73] "PRAM_141870.1.p1" "PRAM_141872" "PRAM_141872.1"
## [76] "PRAM_141872.1.p1" "PRAM_141875.1" "PRAM_141875.1"
## [79] "PRAM_141875.1" "PRAM_141875.1" "PRAM_141875.1"
## [82] "PRAM_141875.1" "PRAM_141875.1" "PRAM_141876.1"
## [85] "PRAM_141876.1" "PRAM_141877.1" "PRAM_141877.1.p2"
## [88] "PRAM_15610" "PRAM_15610.1" "PRAM_15610.1.p1"
## [91] "PRAM_15610.1.p1" "PRAM_15612.1" "PRAM_15612.1.p1"
## [94] "PRAM_15614" "PRAM_15614.1.p1" "PRAM_15617"
## [97] "PRAM_15617.1" "PRAM_15617.1.p1" "PRAM_15671"
## [100] "PRAM_15671.1" "PRAM_15671.1.p3" "PRAM_15687.1.p1"
## [103] "PRAM_15687.1.p1" "PRAM_15687.1.p1" "PRAM_15708.1.p1"
## [106] "PRAM_15708.1.p1" "PRAM_15708.1.p1" "PRAM_15758"
## [109] "PRAM_15758.1" "PRAM_15758.1" "PRAM_15758.1"
## [112] "PRAM_15758.1" "PRAM_15758.1" "PRAM_15758.1.p1"
## [115] "PRAM_15758.1.p1" "PRAM_15763" "PRAM_15763.1"
## [118] "PRAM_15773" "PRAM_15773" "PRAM_15773.1"
## [121] "PRAM_15773.1.p1" "PRAM_15781.1" "PRAM_15781.1"
## [124] "PRAM_15781.1" "PRAM_15781.1" "PRAM_15781.1"
## [127] "PRAM_15781.1" "PRAM_15781.1.p1" "PRAM_15821.1"
## [130] "PRAM_15848" "PRAM_15848" "PRAM_15848"
## [133] "PRAM_15848" "PRAM_15848.1" "PRAM_15848.1"
## [136] "PRAM_15848.1" "PRAM_15848.1" "PRAM_15848.1"
## [139] "PRAM_15848.1" "PRAM_15848.1" "PRAM_15848.1"
## [142] "PRAM_15848.1.p1" "PRAM_15848.1.p1" "PRAM_15856"
## [145] "PRAM_15856" "PRAM_15856" "PRAM_15856"
## [148] "PRAM_15856" "PRAM_15856" "PRAM_15856"
## [151] "PRAM_15856" "PRAM_15856" "PRAM_15856"
## [154] "PRAM_15856" "PRAM_15856" "PRAM_15856.1"
## [157] "PRAM_15856.1" "PRAM_15856.1" "PRAM_15856.1"
## [160] "PRAM_15856.1" "PRAM_15856.1" "PRAM_15856.1.p1"
## [163] "PRAM_16203" "PRAM_16203" "PRAM_16203"
## [166] "PRAM_16203" "PRAM_21846" "PRAM_21846.1"
## [169] "PRAM_21846.1" "PRAM_21846.1" "PRAM_21846.1"
## [172] "PRAM_21846.1.p1" "PRAM_21846.1.p1" "PRAM_21846.1.p1"
## [175] "PRAM_21846.1.p1" "PRAM_21846.1.p1" "PRAM_21846.1.p1"
## [178] "PRAM_21934.1" "PRAM_21934.1" "PRAM_21934.1.p1"
## [181] "PRAM_21934.1.p1" "PRAM_22800" "PRAM_22800"
## [184] "PRAM_22800" "PRAM_22800" "PRAM_22800"
## [187] "PRAM_22800" "PRAM_22800" "PRAM_22800.1"
## [190] "PRAM_22800.1" "PRAM_22800.1.p1" "PRAM_22800.1.p1"
## [193] "PRAM_23091" "PRAM_23722" "PRAM_23722"
## [196] "PRAM_23722" "PRAM_23722" "PRAM_23722"
## [199] "PRAM_23722" "PRAM_23722" "PRAM_23722"
## [202] "PRAM_23722" "PRAM_23722.1" "PRAM_23722.1"
## [205] "PRAM_23722.1" "PRAM_23722.1" "PRAM_23722.1.p1"
## [208] "PRAM_23722.1.p1" "PRAM_23722.1.p1" "PRAM_23722.1.p1"
## [211] "PRAM_24043.1" "PRAM_24043.1" "PRAM_24043.1"
## [214] "PRAM_24077" "PRAM_24077" "PRAM_24077"
## [217] "PRAM_24077.1" "PRAM_24077.1" "PRAM_24077.1"
## [220] "PRAM_24077.1.p1" "PRAM_24077.1.p1" "PRAM_24077.1.p1"
## [223] "PRAM_24440.1.p1" "PRAM_24440.1.p1" "PRAM_24440.1.p1"
## [226] "PRAM_24440.1.p1" "PRAM_24440.1.p1" "PRAM_25343"
## [229] "PRAM_25343.1" "PRAM_25343.1" "PRAM_25343.1.p1"
## [232] "PRAM_25343.1.p1" "PRAM_25466" "PRAM_25466"
## [235] "PRAM_25466" "PRAM_25466" "PRAM_25466.1"
## [238] "PRAM_25466.1" "PRAM_25466.1.p1" "PRAM_25466.1.p1"
## [241] "PRAM_25482" "PRAM_25482" "PRAM_25482"
## [244] "PRAM_25482" "PRAM_25482" "PRAM_25482"
## [247] "PRAM_25482" "PRAM_25601" "PRAM_25601"
## [250] "PRAM_25601" "PRAM_25601" "PRAM_25601"
## [253] "PRAM_25601" "PRAM_25601" "PRAM_25601"
## [256] "PRAM_25601.1" "PRAM_25601.1" "PRAM_25601.1.p1"
## [259] "PRAM_25757" "PRAM_25757.1" "PRAM_25757.1.p1"
## [262] "PRAM_25806.1" "PRAM_25806.1" "PRAM_25806.1"
## [265] "PRAM_25806.1.p1" "PRAM_25806.1.p1" "PRAM_25806.1.p1"
## [268] "PRAM_25806.1.p1" "PRAM_25806.1.p1" "PRAM_25847"
## [271] "PRAM_25847.1" "PRAM_25847.1.p1" "PRAM_25853.1"
## [274] "PRAM_25853.1" "PRAM_25853.1" "PRAM_25853.1.p1"
## [277] "PRAM_25853.1.p1" "PRAM_25853.1.p1" "PRAM_25853.1.p1"
## [280] "PRAM_25855.1" "PRAM_25855.1.p1" "PRAM_25858"
## [283] "PRAM_25858" "PRAM_25858.1" "PRAM_25858.1"
## [286] "PRAM_25858.1.p1" "PRAM_25858.1.p1" "PRAM_25864"
## [289] "PRAM_25864" "PRAM_25864.1" "PRAM_25864.1"
## [292] "PRAM_25864.1.p1" "PRAM_25864.1.p1" "PRAM_25866"
## [295] "PRAM_25866" "PRAM_25866.1" "PRAM_25866.1"
## [298] "PRAM_25866.1.p1" "PRAM_25866.1.p1" "PRAM_25885"
## [301] "PRAM_25885" "PRAM_25885.1" "PRAM_25885.1"
## [304] "PRAM_25885.1.p1" "PRAM_25885.1.p1" "PRAM_25896.1"
## [307] "PRAM_25896.1" "PRAM_25896.1" "PRAM_25896.1"
## [310] "PRAM_25896.1" "PRAM_25896.1" "PRAM_25900"
## [313] "PRAM_25900.1" "PRAM_25900.1.p1" "PRAM_25902"
## [316] "PRAM_25902.1" "PRAM_25902.1.p1" "PRAM_25906.1"
## [319] "PRAM_25906.1" "PRAM_25906.1" "PRAM_25906.1"
## [322] "PRAM_25906.1" "PRAM_25906.1" "PRAM_25906.1"
## [325] "PRAM_25906.1" "PRAM_25906.1" "PRAM_25906.1"
## [328] "PRAM_25906.1" "PRAM_25906.1" "PRAM_25906.1"
## [331] "PRAM_25911" "PRAM_25911.1" "PRAM_25911.1"
## [334] "PRAM_25911.1.p1" "PRAM_25921" "PRAM_25921"
## [337] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [340] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [343] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [346] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [349] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [352] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [355] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [358] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [361] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [364] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [367] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [370] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [373] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [376] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [379] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [382] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [385] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [388] "PRAM_25921" "PRAM_25921" "PRAM_25921"
## [391] "PRAM_25921" "PRAM_25921" "PRAM_25921.1"
## [394] "PRAM_25921.1" "PRAM_25921.1.p1" "PRAM_25921.1.p1"
## [397] "PRAM_25923.1" "PRAM_25923.1" "PRAM_25923.1.p1"
## [400] "PRAM_25923.1.p1" "PRAM_25927" "PRAM_25927.1"
## [403] "PRAM_25927.1.p1" "PRAM_25933" "PRAM_25933.1"
## [406] "PRAM_25933.1.p1" "PRAM_25935" "PRAM_25935.1"
## [409] "PRAM_25935.1.p1" "PRAM_25937" "PRAM_25937"
## [412] "PRAM_25937" "PRAM_25937" "PRAM_25937"
## [415] "PRAM_25937" "PRAM_25937" "PRAM_25937"
## [418] "PRAM_25937.1.p1" "PRAM_25937.1.p1" "PRAM_25937.1.p1"
## [421] "PRAM_25937.1.p1" "PRAM_25939" "PRAM_25939.1"
## [424] "PRAM_25939.1.p1" "PRAM_25945" "PRAM_25945.1"
## [427] "PRAM_25945.1.p1" "PRAM_25951" "PRAM_25951"
## [430] "PRAM_25951" "PRAM_25951.1" "PRAM_25951.1"
## [433] "PRAM_25951.1" "PRAM_25951.1.p1" "PRAM_25951.1.p1"
## [436] "PRAM_25951.1.p1" "PRAM_25994" "PRAM_25994.1"
## [439] "PRAM_25994.1.p1" "PRAM_25996" "PRAM_25996"
## [442] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [445] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [448] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [451] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [454] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [457] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [460] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [463] "PRAM_25996" "PRAM_25996" "PRAM_25996"
## [466] "PRAM_25996" "PRAM_25996.1" "PRAM_25996.1"
## [469] "PRAM_25996.1" "PRAM_25996.1" "PRAM_25996.1"
## [472] "PRAM_25996.1" "PRAM_25996.1" "PRAM_25996.1"
## [475] "PRAM_26005.1" "PRAM_26005.1" "PRAM_26007"
## [478] "PRAM_26007.1" "PRAM_26007.1.p1" "PRAM_26016"
## [481] "PRAM_26016.1" "PRAM_26016.1.p1" "PRAM_26019"
## [484] "PRAM_26019" "PRAM_26019" "PRAM_26019"
## [487] "PRAM_26019" "PRAM_26019.1" "PRAM_26019.1.p1"
## [490] "PRAM_26020" "PRAM_26020.1" "PRAM_26020.1.p1"
## [493] "PRAM_26068" "PRAM_26068.1" "PRAM_26068.1.p1"
## [496] "PRAM_26082.1" "PRAM_26082.1" "PRAM_26082.1.p1"
## [499] "PRAM_26082.1.p1" "PRAM_26097" "PRAM_26097"
## [502] "PRAM_26097.1" "PRAM_26097.1.p1" "PRAM_26097.1.p1"
## [505] "PRAM_26101.1" "PRAM_26105" "PRAM_26105.1"
## [508] "PRAM_26105.1.p1" "PRAM_26108" "PRAM_26108"
## [511] "PRAM_26108" "PRAM_26108.1" "PRAM_26108.1.p1"
## [514] "PRAM_26108.1.p1" "PRAM_26108.1.p1" "PRAM_26108.1.p1"
## [517] "PRAM_26109" "PRAM_26109.1" "PRAM_26109.1"
## [520] "PRAM_26109.1.p1" "PRAM_26109.1.p1" "PRAM_26109.1.p1"
## [523] "PRAM_26113" "PRAM_26113" "PRAM_26113"
## [526] "PRAM_26113" "PRAM_26113" "PRAM_26113"
## [529] "PRAM_26113" "PRAM_26113" "PRAM_26113.1"
## [532] "PRAM_26113.1" "PRAM_26113.1" "PRAM_26113.1"
## [535] "PRAM_26113.1" "PRAM_26128" "PRAM_26128.1"
## [538] "PRAM_26128.1.p1" "PRAM_26129" "PRAM_26129.1"
## [541] "PRAM_26129.1" "PRAM_26129.1.p1" "PRAM_26132"
## [544] "PRAM_26132" "PRAM_26132.1" "PRAM_26132.1"
## [547] "PRAM_26132.1.p1" "PRAM_26132.1.p1" "PRAM_26139"
## [550] "PRAM_26139" "PRAM_26139" "PRAM_26139.1"
## [553] "PRAM_26139.1" "PRAM_26139.1" "PRAM_26139.1"
## [556] "PRAM_26139.1.p1" "PRAM_26139.1.p1" "PRAM_26139.1.p1"
## [559] "PRAM_26139.1.p1" "PRAM_26139.1.p1" "PRAM_26145"
## [562] "PRAM_26145" "PRAM_26145.1" "PRAM_26145.1"
## [565] "PRAM_26145.1.p1" "PRAM_26184" "PRAM_26184.1"
## [568] "PRAM_26184.1.p1" "PRAM_26189" "PRAM_26189"
## [571] "PRAM_26189" "PRAM_26189" "PRAM_26189.1"
## [574] "PRAM_26189.1" "PRAM_26189.1" "PRAM_26189.1"
## [577] "PRAM_26189.1" "PRAM_26189.1.p1" "PRAM_26195.1"
## [580] "PRAM_26195.1" "PRAM_26195.1.p1" "PRAM_26195.1.p1"
## [583] "PRAM_26207" "PRAM_26207" "PRAM_26207.1"
## [586] "PRAM_26207.1" "PRAM_26207.1" "PRAM_26207.1"
## [589] "PRAM_26207.1.p1" "PRAM_26211" "PRAM_26211.1"
## [592] "PRAM_26211.1.p1" "PRAM_26212" "PRAM_26212.1"
## [595] "PRAM_26212.1.p1" "PRAM_26216" "PRAM_26216"
## [598] "PRAM_26216" "PRAM_26216" "PRAM_26216.1"
## [601] "PRAM_26216.1" "PRAM_26216.1" "PRAM_26216.1"
## [604] "PRAM_26216.1.p1" "PRAM_26216.1.p1" "PRAM_26216.1.p1"
## [607] "PRAM_26216.1.p1" "PRAM_26218" "PRAM_26218.1"
## [610] "PRAM_26218.1.p1" "PRAM_26220" "PRAM_26220.1"
## [613] "PRAM_26220.1.p1" "PRAM_26242" "PRAM_26242"
## [616] "PRAM_26242.1" "PRAM_26242.1" "PRAM_26242.1"
## [619] "PRAM_26242.1.p1" "PRAM_26242.1.p1" "PRAM_26243.1"
## [622] "PRAM_26243.1.p1" "PRAM_26245" "PRAM_26245"
## [625] "PRAM_26245" "PRAM_26245" "PRAM_26245.1"
## [628] "PRAM_26245.1" "PRAM_26245.1" "PRAM_26245.1"
## [631] "PRAM_26245.1" "PRAM_26245.1" "PRAM_26245.1"
## [634] "PRAM_26245.1" "PRAM_26245.1" "PRAM_26245.1"
## [637] "PRAM_26245.1" "PRAM_26245.1" "PRAM_26245.1"
## [640] "PRAM_26245.1" "PRAM_26247" "PRAM_26247.1"
## [643] "PRAM_26247.1.p1" "PRAM_26248" "PRAM_26248"
## [646] "PRAM_26248" "PRAM_26248" "PRAM_26248.1"
## [649] "PRAM_26248.1" "PRAM_26248.1" "PRAM_26248.1"
## [652] "PRAM_26248.1.p1" "PRAM_26248.1.p1" "PRAM_26248.1.p1"
## [655] "PRAM_26248.1.p1" "PRAM_26252" "PRAM_26252.1"
## [658] "PRAM_26252.1.p1" "PRAM_26255" "PRAM_26255.1"
## [661] "PRAM_26255.1" "PRAM_26255.1" "PRAM_26255.1"
## [664] "PRAM_26255.1" "PRAM_26255.1.p1" "PRAM_26257"
## [667] "PRAM_26257.1" "PRAM_26257.1" "PRAM_26257.1"
## [670] "PRAM_26257.1.p1" "PRAM_26262" "PRAM_26262"
## [673] "PRAM_26262.1" "PRAM_26262.1" "PRAM_26262.1"
## [676] "PRAM_26262.1.p1" "PRAM_26262.1.p1" "PRAM_26263.1"
## [679] "PRAM_26263.1" "PRAM_26263.1" "PRAM_26263.1.p1"
## [682] "PRAM_26263.1.p1" "PRAM_26268.1" "PRAM_26268.1"
## [685] "PRAM_26268.1" "PRAM_26273" "PRAM_26273"
## [688] "PRAM_26273" "PRAM_26273" "PRAM_26273.1"
## [691] "PRAM_26273.1" "PRAM_26273.1" "PRAM_26273.1"
## [694] "PRAM_26276" "PRAM_26276.1" "PRAM_26276.1.p1"
## [697] "PRAM_26279" "PRAM_26279" "PRAM_26279.1"
## [700] "PRAM_26279.1" "PRAM_26279.1" "PRAM_26279.1.p1"
## [703] "PRAM_26279.1.p1" "PRAM_26279.1.p1" "PRAM_26280"
## [706] "PRAM_26280" "PRAM_26280.1" "PRAM_26280.1"
## [709] "PRAM_26280.1.p1" "PRAM_26280.1.p1" "PRAM_26281"
## [712] "PRAM_26281" "PRAM_26281.1" "PRAM_26281.1"
## [715] "PRAM_26281.1" "PRAM_26281.1" "PRAM_26281.1"
## [718] "PRAM_26281.1.p1" "PRAM_26281.1.p1" "PRAM_26296"
## [721] "PRAM_26296.1" "PRAM_26296.1.p2" "PRAM_26298"
## [724] "PRAM_26298.1" "PRAM_26301" "PRAM_26301.1"
## [727] "PRAM_26301.1.p1" "PRAM_26302" "PRAM_26302.1"
## [730] "PRAM_26302.1.p1" "PRAM_26307" "PRAM_26307"
## [733] "PRAM_26307" "PRAM_26307.1" "PRAM_26307.1"
## [736] "PRAM_26307.1.p1" "PRAM_26312" "PRAM_26312.1"
## [739] "PRAM_26312.1" "PRAM_26312.1.p1" "PRAM_26314"
## [742] "PRAM_26314" "PRAM_26314.1" "PRAM_26314.1"
## [745] "PRAM_26314.1.p1" "PRAM_26314.1.p1" "PRAM_26322"
## [748] "PRAM_26322.1" "PRAM_26322.1.p1" "PRAM_26322.1.p1"
## [751] "PRAM_26322.1.p1" "PRAM_26326" "PRAM_26326.1"
## [754] "PRAM_26326.1.p1" "PRAM_26326.1.p1" "PRAM_26326.1.p1"
## [757] "PRAM_26328" "PRAM_26328" "PRAM_26328.1"
## [760] "PRAM_26328.1" "PRAM_26331" "PRAM_26331.1"
## [763] "PRAM_26331.1" "PRAM_26334" "PRAM_26334.1"
## [766] "PRAM_26334.1" "PRAM_26363" "PRAM_26363.1"
## [769] "PRAM_26363.1.p3" "PRAM_267" "PRAM_267"
## [772] "PRAM_267.1" "PRAM_267.1" "PRAM_267.1"
## [775] "PRAM_267.1" "PRAM_267.1.p1" "PRAM_267.1.p1"
## [778] "PRAM_272" "PRAM_272.1" "PRAM_272.1.p1"
## [781] "PRAM_274.1" "PRAM_281" "PRAM_281.1"
## [784] "PRAM_283" "PRAM_283.1" "PRAM_36897"
## [787] "PRAM_36897.1" "PRAM_36897.1.p1" "PRAM_36916"
## [790] "PRAM_36916" "PRAM_36916.1" "PRAM_36916.1"
## [793] "PRAM_36916.1.p1" "PRAM_36916.1.p1" "PRAM_36921"
## [796] "PRAM_36921.1" "PRAM_36921.1" "PRAM_36921.1.p1"
## [799] "PRAM_36926" "PRAM_36926" "PRAM_36926"
## [802] "PRAM_36926" "PRAM_36926" "PRAM_36926.1"
## [805] "PRAM_36926.1" "PRAM_36926.1" "PRAM_36926.1"
## [808] "PRAM_36938" "PRAM_36938.1" "PRAM_36938.1"
## [811] "PRAM_36938.1" "PRAM_36938.1.p1" "PRAM_36938.1.p1"
## [814] "PRAM_36938.1.p1" "PRAM_36967" "PRAM_36967"
## [817] "PRAM_36967" "PRAM_36967" "PRAM_36967.1"
## [820] "PRAM_36967.1" "PRAM_36967.1" "PRAM_36967.1"
## [823] "PRAM_36967.1.p2" "PRAM_36976" "PRAM_36976"
## [826] "PRAM_36976" "PRAM_36976.1" "PRAM_36976.1"
## [829] "PRAM_36976.1" "PRAM_36976.1" "PRAM_36976.1"
## [832] "PRAM_36976.1" "PRAM_36976.1.p1" "PRAM_36976.1.p1"
## [835] "PRAM_36976.1.p1" "PRAM_36977.1" "PRAM_36977.1"
## [838] "PRAM_36977.1" "PRAM_36977.1" "PRAM_36977.1"
## [841] "PRAM_36997" "PRAM_36997" "PRAM_36997"
## [844] "PRAM_36997.1" "PRAM_36997.1" "PRAM_36997.1"
## [847] "PRAM_36997.1.p1" "PRAM_36997.1.p1" "PRAM_36997.1.p1"
## [850] "PRAM_37001" "PRAM_37001" "PRAM_37001.1"
## [853] "PRAM_37001.1" "PRAM_37001.1.p1" "PRAM_37001.1.p1"
## [856] "PRAM_37004" "PRAM_37004.1" "PRAM_37004.1.p1"
## [859] "PRAM_37006" "PRAM_37006.1" "PRAM_37006.1.p1"
## [862] "PRAM_37008" "PRAM_37008" "PRAM_37008"
## [865] "PRAM_37008.1" "PRAM_37008.1" "PRAM_37008.1"
## [868] "PRAM_37008.1.p1" "PRAM_37011" "PRAM_37011"
## [871] "PRAM_37011.1" "PRAM_37011.1" "PRAM_37011.1.p1"
## [874] "PRAM_37011.1.p1" "PRAM_43769.1" "PRAM_43769.1"
## [877] "PRAM_43769.1.p1" "PRAM_448.1" "PRAM_448.1"
## [880] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [883] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [886] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [889] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [892] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [895] "PRAM_448.1" "PRAM_448.1" "PRAM_448.1"
## [898] "PRAM_448.1" "PRAM_46974" "PRAM_46974"
## [901] "PRAM_46974" "PRAM_46974" "PRAM_46974"
## [904] "PRAM_46974" "PRAM_46974.1" "PRAM_46974.1"
## [907] "PRAM_46974.1" "PRAM_46974.1" "PRAM_46974.1"
## [910] "PRAM_46974.1" "PRAM_46974.1" "PRAM_46974.1"
## [913] "PRAM_46974.1" "PRAM_46974.1" "PRAM_52553.1"
## [916] "PRAM_52553.1" "PRAM_52553.1.p1" "PRAM_52553.1.p1"
## [919] "PRAM_66596.1.p1" "PRAM_66596.1.p1" "PRAM_66596.1.p1"
## [922] "PRAM_67540.1" "PRAM_67549" "PRAM_67549.1"
## [925] "PRAM_67549.1" "PRAM_67549.1.p1" "PRAM_67549.1.p1"
## [928] "PRAM_73094" "PRAM_73094" "PRAM_73094"
## [931] "PRAM_73094" "PRAM_73094" "PRAM_73094"
## [934] "PRAM_73094" "PRAM_758" "PRAM_758.1"
## [937] "PRAM_758.1.p1" "PRAM_80941" "PRAM_80941"
## [940] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [943] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [946] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [949] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [952] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [955] "PRAM_80941" "PRAM_80941" "PRAM_80941"
## [958] "PRAM_80941.1" "PRAM_80941.1" "PRAM_80941.1.p1"
## [961] "PRAM_80945" "PRAM_80945.1" "PRAM_80945.1"
## [964] "PRAM_80945.1.p1" "PRAM_8262" "PRAM_8262"
## [967] "PRAM_8262" "PRAM_85890" "PRAM_85890"
## [970] "PRAM_85890" "PRAM_85890" "PRAM_888"
## [973] "PRAM_888" "PRAM_888" "PRAM_888"
## [976] "PRAM_888" "PRAM_888" "PRAM_888.1"
## [979] "PRAM_891.1" "PRAM_95186.1.p1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6052737 323.3 10998861 587.5 13748576 734.3
## Vcells 100323228 765.5 188153622 1435.5 188136477 1435.4
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 18219
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22707
## # sly unigue genes: 23599
## # ath highest connection degree: 55
## # sly highest connection degree: 58
## # genes in ath with degree > 30: 204
## # genes in sly with degree > 30: 197
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 46257 18219 13068 6779
##
## 100% match bad MapMan no match partial match
## 1 21434 9378 11042 5277
## 2 7504 2496 1238 874
## 3 5370 1714 437 281
## 4 5075 1682 209 159
## 5 4739 1901 108 132
## 6 2135 1048 34 56
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 84323
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 20050 7582 3266 1187
##
## 100% match bad MapMan no match partial match
## 1 1274 1186 1978 216
## 2 3918 962 634 479
## 3 3711 1081 331 179
## 4 4448 1476 187 131
## 5 4564 1829 102 126
## 6 2135 1048 34 56
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 18491
## # sly unigue genes: 20258
## # ath highest connection degree: 24
## # sly highest connection degree: 17
## # genes in ath with degree > 30: 0
## # genes in sly with degree > 30: 0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 17647
## 2: compara 6150
## 3: PLAZA 5638
## 4: OrthoDB_FastOMA_RBH_MapMan4 567
## 5: FastOMA_OrthoDB_MapMan4 1265
## 6: OrthoDB_RBH_MapMan4 415
## 7: FastOMA_RBH_MapMan4 403
## 8: reject 52238
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_OrthoDB_MapMan4
## 184 61
## FastOMA_RBH_MapMan4 MCScanX
## 37 789
## OrthoDB_FastOMA_RBH_MapMan4 OrthoDB_RBH_MapMan4
## 44 17
## PLAZA reject
## 206 2014
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'vvi'
, plantNameOut = "grapevine"
, plantDirOut = file.path('..', 'reports', group, "grapevine")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x0000021c38750a48>
##
##
## processing file: ./09_translate-child_strict.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-67] | |…… | 11% | |…….. | 15% [unnamed-chunk-68] | |……… | 19% | |……….. | 22% [unnamed-chunk-69] | |…………. | 26% | |…………… | 30% [unnamed-chunk-70] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-71] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-72] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-73] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-74] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-75] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-76] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-77] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-78] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-79] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 13825 61099 18589 49707 18488 26763
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 ATCG00380 ath Vitvi05_01chr00g00010 vvi FastOMA
## 2 ATCG00780 ath Vitvi05_01chr00g00040 vvi FastOMA
## 3 AT4G20320 ath Vitvi05_01chr19g23100 vvi FastOMA
## 4 AT1G79580 ath Vitvi05_01chr19g23110 vvi FastOMA
## 5 AT1G10350 ath Vitvi05_01chr01g13730 vvi MCScanX
## 6 AT1G10370 ath Vitvi05_01chr01g13610 vvi MCScanX
## 7 ATMG01190 ath Vitvi05_01chr10g19400 vvi MCScanX
## 8 ATMG01280 ath Vitvi05_01chr10g19500 vvi MCScanX
## 9 AT1G44800 ath Vitvi05_01chr18g04300 vvi OrthoDB
## 10 AT1G21890 ath Vitvi05_01chr18g04300 vvi OrthoDB
## 11 AT2G07695 ath Vitvi05_01chr00g07410 vvi OrthoDB
## 12 AT2G07695 ath Vitvi05_01chr10g19710 vvi OrthoDB
## 13 AT3G20250 ath Vitvi05_01chr09g20830 vvi PLAZA
## 14 AT4G25880 ath Vitvi05_01chr09g20830 vvi PLAZA
## 15 AT3G27027 ath Vitvi05_01chr17g01390 vvi PLAZA
## 16 AT3G01940 ath Vitvi05_01chr17g01390 vvi PLAZA
## 17 AT1G01020 ath Vitvi05_01chr10g07040 vvi RBH
## 18 AT1G01030 ath Vitvi05_01chr02g03430 vvi RBH
## 19 ATMG01410 ath Vitvi05_01chr00g07980 vvi RBH
## 20 ATMG01410 ath Vitvi05_01chr10g19280 vvi RBH
## 21 AT2G07785 ath Vitvi05_01chr10g19420 vvi compara
## 22 AT2G07734 ath Vitvi05_01chr00g07530 vvi compara
## 23 AT5G47380 ath Vitvi05_01chr19g16420 vvi compara
## 24 AT1G79390 ath Vitvi05_01chr19g22760 vvi compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4131296 220.7 10998861 587.5 13748576 734.3
## Vcells 88158984 672.6 188153622 1435.5 188136477 1435.4
## [1] 23140
## [1] 24769
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 13825 61099 18589 49707 18488 26763
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "all_pathways" "short_name"
## [17] "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE
## 96387
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6424627 343.2 10998861 587.5 13748576 734.3
## Vcells 107951188 823.7 188153622 1435.5 188136789 1435.4
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 21558
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 23140
## # vvi unigue genes: 24769
## # ath highest connection degree: 162
## # vvi highest connection degree: 115
## # genes in ath with degree > 30: 372
## # genes in vvi with degree > 30: 239
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 53971 21558 13305 7553
##
## 100% match bad MapMan no match partial match
## 1 29342 11745 11590 6714
## 2 8088 3523 1082 552
## 3 4810 1668 428 127
## 4 4460 1498 109 83
## 5 4562 1839 65 51
## 6 2709 1285 31 26
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt)) # reset if needed
print(table(dt$filter_criteria))##
## reject
## 96387
priority_methods = setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
# rows indices satisfying all conditions; . causes issue!
eligible_rows <- which(
dt$filter_criteria == "reject" &
dt[[method]] == TRUE &
dt$to_geneID %nin% covered_genes &
dt$use == TRUE
)
if(length(eligible_rows) > 0) {
# Update FILTER
dt[eligible_rows, filter_criteria := method]
# Update covered_genes
covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
# Update use
dt[to_geneID %in% covered_genes, use := FALSE]
} else {
cat("No eligible rows found for:", method, "\n")
}
}
# uncovered genes
eligible_rows = which(
dt$filter_criteria == "reject" &
dt$use == TRUE &
grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
# how many special methods are TRUE per row
method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
count_covered = rowSums(method_matrix, na.rm = TRUE)
new_criteria = rep("reject", length(eligible_rows))
# For rows with 2 or 3 methods
for (j in seq_along(eligible_rows)) {
if (count_covered[j] == 3) {
new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
} else if (count_covered[j] == 2) {
covered_by = special_methods[method_matrix[j, ]]
new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
}
}
# update
dt[eligible_rows, filter_criteria := new_criteria]
dt[eligible_rows, use := FALSE]
} else {
print("Nothing here!")
}
# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 20841 7247 2063 814
##
## 100% match bad MapMan no match partial match
## 1 1456 962 1060 211
## 2 4849 912 450 360
## 3 3544 969 362 91
## 4 3924 1352 98 77
## 5 4359 1767 62 49
## 6 2709 1285 31 26
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria,
# levels = c("MCScanX", "compara", "PLAZA",
# "OrthoDB_FastOMA_RBH",
# "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
# "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 19002
## # vvi unigue genes: 19327
## # ath highest connection degree: 73
## # vvi highest connection degree: 18
## # genes in ath with degree > 30: 4
## # genes in vvi with degree > 30: 0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
# "OrthoDB_FastOMA_RBH",
# "OrthoDB_RBH",
# "FastOMA_OrthoDB",
# "FastOMA_RBH",
# "OrthoDB_MapMan4",
# "RBH_MapMan4",
# "FastOMA_MapMan4
"OrthoDB_FastOMA_RBH_MapMan4",
"FastOMA_OrthoDB_MapMan4",
"OrthoDB_RBH_MapMan4",
"FastOMA_RBH_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 18589
## 2: compara 4089
## 3: PLAZA 4650
## 4: OrthoDB_FastOMA_RBH_MapMan4 744
## 5: FastOMA_OrthoDB_MapMan4 1802
## 6: OrthoDB_RBH_MapMan4 706
## 7: FastOMA_RBH_MapMan4 385
## 8: reject 65422
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_OrthoDB_MapMan4
## 125 108
## FastOMA_RBH_MapMan4 MCScanX
## 25 754
## OrthoDB_FastOMA_RBH_MapMan4 OrthoDB_RBH_MapMan4
## 67 33
## PLAZA reject
## 238 1593
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'),
asTable = TRUE)# Step 1: params_list
# params_list <- list(
# ...
# )
#
# Step 2: YAML header in 09_fruitTrees-child.Rmd
# ---
# title: "fruitTrees Child"
# output: html_document
# params:
# plantName1: NULL
# plantName2: NULL
# plantName3: NULL
# plantName4: NULL
# plantDirIn: NULL
# plantNameOut: NULL
# plantDirOut: NULL
# pattern_in: NULL
# pattern_out: NULL
# compara_pattern_in1: NULL
# compara_pattern_in2: NULL
# plaza_pattern_in1: NULL
# plaza_pattern_in2: NULL
# ref_genome: NULL
# mercator: NULL
# mercatorPatternIn1: NULL
# mercatorPatternOut1: NULL
# mercatorPatternIn2: NULL
# mercatorPatternOut2: NULL
# ---
#
#
# access params in the script like:
# params$plantName1
# params$plantDirOut
#
# Step 3: Call render() like
# rmarkdown::render(
# input = "09_fruitTrees-child.Rmd",
# params = params_list,
# envir = new.env(parent = globalenv()), # optional: isolate execution
# output_format = "html_document",
# quiet = FALSE
# )
#
#
# This will:
# Inject params_list into params$...
# Knit the child Rmd in a separate process
# Print progress to the console (quiet = FALSE)
# Save an .html file to the working directory## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8
## [2] LC_CTYPE=English_United Kingdom.utf8
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.utf8
##
## time zone: Europe/Ljubljana
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_3.5.2 knitr_1.50 data.table_1.17.0
## [5] magrittr_2.0.3
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 crayon_1.5.3 dplyr_1.1.4
## [5] compiler_4.4.1 zip_2.3.2 Rcpp_1.0.14 tidyselect_1.2.1
## [9] stringr_1.5.1 dichromat_2.0-0.1 jquerylib_0.1.4 textshaping_1.0.1
## [13] systemfonts_1.2.3 scales_1.4.0 yaml_2.3.10 fastmap_1.2.0
## [17] R6_2.6.1 labeling_0.4.3 generics_0.1.4 patchwork_1.3.0
## [21] igraph_2.1.4 openxlsx_4.2.8 tibble_3.2.1 bslib_0.9.0
## [25] pillar_1.10.2 RColorBrewer_1.1-3 rlang_1.1.5 utf8_1.2.5
## [29] cachem_1.1.0 stringi_1.8.7 xfun_0.52 sass_0.4.10
## [33] cli_3.6.3 withr_3.0.2 digest_0.6.37 grid_4.4.1
## [37] rstudioapi_0.17.1 lifecycle_1.0.4 vctrs_0.6.5 evaluate_1.0.3
## [41] glue_1.8.0 farver_2.1.2 ragg_1.4.0 colorspace_2.1-1
## [45] rmarkdown_2.29 tools_4.4.1 pkgconfig_2.0.3 htmltools_0.5.8.1